# import libraries
# NOTE: this file is a Jupyter-notebook export; the %matplotlib magic below
# only works inside IPython/Jupyter.
from __future__ import division

%matplotlib inline

# --- standard library ---
import os
from datetime import datetime, timedelta

# --- third-party: data wrangling / plotting ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import chart_studio.plotly as py
#import plotly.plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go

# --- third-party: scikit-learn ---
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix  # added: used by model_builder but never imported
from sklearn.metrics import precision_recall_curve  # added: used for the PR-threshold chart but never imported
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler  # added: used to scale features for LogisticRegression but never imported
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

#initiate visualization library for jupyter notebook
pyoff.init_notebook_mode()
# define the directory where all the raw dumps live
# NOTE(review): hard-coded Windows path — this will fail on any other machine;
# consider reading it from an environment variable or CLI argument.
os.chdir(r'C:\Users\mainak.kundu\Desktop\REVOLT')
# read the four raw extracts: users, transactions, devices, notifications
rev_user = pd.read_csv('rev-users.csv')
rev_trns = pd.read_csv('rev-transactions.csv')
rev_dvc = pd.read_csv('rev-devices.csv')
rev_notfc = pd.read_csv('rev-notifications.csv')
# print each dataframe's shape as a quick sanity check
print('user_data:',rev_user.shape)
print('transaction_data:',rev_trns.shape)
print('device_data:',rev_dvc.shape)
print('notification_data:',rev_notfc.shape)
#rename created_date in both extracts so the two date columns can be told
#apart after merging notifications onto transactions
rev_notfc.rename(columns = {'created_date':'notification_date'}, inplace = True) ## change the column name of created_date
rev_trns.rename(columns = {'created_date':'transaction_date'}, inplace = True) ## change the column name of created_date
#convert transaction_date from string to datetime
rev_trns['transaction_date'] = pd.to_datetime(rev_trns['transaction_date'])
#creating a YearMonth integer (e.g. 201901) for ease of reporting and visualization
rev_trns['yr_mnth'] = rev_trns['transaction_date'].map(lambda date: 100*date.year + date.month)
from matplotlib import rcParams
# figure size in inches
rcParams['figure.figsize'] = 11.7,8.27
# average transaction amount per month
tx_revenue = rev_trns.groupby(['yr_mnth'])['amount_usd'].mean().reset_index()
# FIX: pass the vectors as keyword arguments — positional x/y are deprecated
# in seaborn 0.12 and removed in later releases
sns.barplot(x=tx_revenue['yr_mnth'], y=tx_revenue['amount_usd'])
tx_revenue.head(2)
# Monthly active users
#creating a monthly-active-customers dataframe by counting unique user ids per month
tx_monthly_active = rev_trns.groupby('yr_mnth')['user_id'].nunique().reset_index()
#print the dataframe
tx_monthly_active
#plotting the output as a plotly bar chart
plot_data = [
go.Bar(
x=tx_monthly_active['yr_mnth'],
y=tx_monthly_active['user_id'],
)
]
plot_layout = go.Layout(
xaxis={"type": "category"},
title='Monthly Active Customers'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
#create a dataframe containing user_id and first purchase date
tx_min_purchase = rev_trns.groupby('user_id').transaction_date.min().reset_index()
tx_min_purchase.columns = ['user_id','min_purchase_date']
# collapse the first-purchase date to the same YYYYMM integer used by yr_mnth
tx_min_purchase['min_purchase_date'] = tx_min_purchase['min_purchase_date'].map(lambda date: 100*date.year + date.month)
rev_trns = pd.merge(rev_trns, tx_min_purchase, on='user_id')
rev_trns.head()
#create a column called user_type: 'Existing' when the transaction month is
#after the user's first purchase month, otherwise 'New'
rev_trns['user_type'] = 'New'
rev_trns.loc[rev_trns['yr_mnth']>rev_trns['min_purchase_date'],'user_type'] = 'Existing'
#calculate the average revenue per month for each user type
tx_user_type_revenue = rev_trns.groupby(['yr_mnth','user_type'])['amount_usd'].mean().reset_index()
tx_user_type_revenue.head(2)
# FIX: keyword arguments — positional x/y vectors are deprecated in
# seaborn 0.12 and removed later
sns.pointplot(x=tx_user_type_revenue['yr_mnth'], y=tx_user_type_revenue['amount_usd'], hue=tx_user_type_revenue['user_type'])
#create a dataframe with the new/existing user ratio — drop NA values
#(the first month has no existing users)
tx_user_ratio = rev_trns.query("user_type == 'New'").groupby(['yr_mnth'])['user_id'].nunique()/rev_trns.query("user_type == 'Existing'").groupby(['yr_mnth'])['user_id'].nunique()
tx_user_ratio = tx_user_ratio.reset_index()
tx_user_ratio = tx_user_ratio.dropna()
# FIX: keyword arguments for the same seaborn deprecation
sns.barplot(x=tx_user_ratio['yr_mnth'], y=tx_user_ratio['user_id'])
#identify which users are active by looking at their revenue per month
tx_user_purchase = rev_trns.groupby(['user_id','yr_mnth'])['amount_usd'].sum().reset_index()
#create a retention matrix with crosstab: one row per user, one column per
#month, cell = number of active (user, month) observations
tx_retention = pd.crosstab(tx_user_purchase['user_id'], tx_user_purchase['yr_mnth']).reset_index()
tx_retention.head()
#create an array of dictionaries keeping Retained & Total user counts per month
# NOTE(review): columns[2:] skips the 'user_id' column AND the first month, so
# the first month-over-month pair is never evaluated — confirm that
# columns[1:] was not intended here.
months = tx_retention.columns[2:]
retention_array = []
for i in range(len(months)-1):
    retention_data = {}
    selected_month = months[i+1]
    prev_month = months[i]
    retention_data['yr_mnth'] = int(selected_month)
    retention_data['TotalUserCount'] = tx_retention[selected_month].sum()
    # retained = users active in both the selected month and the month before
    retention_data['RetainedUserCount'] = tx_retention[(tx_retention[selected_month]>0) & (tx_retention[prev_month]>0)][selected_month].sum()
    retention_array.append(retention_data)
#convert the array to a dataframe and calculate the retention rate
tx_retention = pd.DataFrame(retention_array)
tx_retention['RetentionRate'] = tx_retention['RetainedUserCount']/tx_retention['TotalUserCount']
#plot the retention rate graph
plot_data = [
go.Scatter(
x=tx_retention['yr_mnth'],
y=tx_retention['RetentionRate'],
name="organic"
)
]
plot_layout = go.Layout(
xaxis={"type": "category"},
title='Monthly Retention Rate'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# rebuild the purchase matrix with integer amounts (overwrites the earlier crosstab)
tx_user_purchase = rev_trns.groupby(['user_id','yr_mnth'])['amount_usd'].sum().astype(int).reset_index()
tx_retention = pd.crosstab(tx_user_purchase['user_id'],tx_user_purchase['yr_mnth']).reset_index()
tx_retention.head()
#create a generic user dataframe to keep user_id and the new segmentation scores
tx_user = pd.DataFrame(rev_trns['user_id'].unique())
tx_user.columns = ['user_id']
#get the max purchase date for each customer and create a dataframe with it
tx_max_purchase = rev_trns.groupby('user_id').transaction_date.max().reset_index()
tx_max_purchase.columns = ['user_id','max_purchase_date']
#Recency = days between the user's last purchase and the overall last purchase
#(the observation point is the max transaction date in the dataset)
tx_max_purchase['Recency'] = (tx_max_purchase['max_purchase_date'].max() - tx_max_purchase['max_purchase_date']).dt.days
#merge this dataframe into our new user dataframe
tx_user = pd.merge(tx_user, tx_max_purchase[['user_id','Recency']], on='user_id')
tx_user.head()
tx_user['Recency'].describe()
from sklearn.cluster import KMeans
# elbow method: run k-means for k = 1..9 on Recency and record the inertia (SSE)
sse = {}
tx_recency = tx_user[['Recency']]
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(tx_recency)
    # FIX: the old code wrote kmeans.labels_ into tx_recency inside the loop,
    # so every fit after k=1 ran on [Recency, clusters] — the previous labels
    # leaked in as a feature and distorted the elbow curve. The column was
    # never used, so the assignment is dropped entirely.
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.show() ## 2 or 3(max) clusters are enough
#build 3 clusters for recency and add them to the dataframe
#(comment previously said 4, but the code builds 3)
kmeans = KMeans(n_clusters=3)
kmeans.fit(tx_user[['Recency']])
tx_user['RecencyCluster'] = kmeans.predict(tx_user[['Recency']])
#function for ordering cluster numbers
def order_cluster(cluster_field_name, target_field_name, df, ascending):
    """Relabel k-means cluster ids so they are ordered by the cluster mean of
    ``target_field_name``.

    Parameters
    ----------
    cluster_field_name : str
        Column holding the raw k-means labels.
    target_field_name : str
        Metric used to rank the clusters.
    df : pandas.DataFrame
        Frame containing both columns.
    ascending : bool
        True -> cluster id 0 gets the lowest mean of the target.

    Returns a copy of ``df`` with ``cluster_field_name`` replaced by the
    ordered labels. (FIX: removed the unused ``new_cluster_field_name`` local.)
    """
    df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    df_new = df_new.sort_values(by=target_field_name, ascending=ascending).reset_index(drop=True)
    # the sorted positional index becomes the new, ordered cluster id
    df_new['index'] = df_new.index
    df_final = pd.merge(df, df_new[[cluster_field_name, 'index']], on=cluster_field_name)
    df_final = df_final.drop([cluster_field_name], axis=1)
    df_final = df_final.rename(columns={"index": cluster_field_name})
    return df_final
#relabel recency clusters in descending order of mean Recency, so higher
#cluster ids correspond to more recently active users
tx_user = order_cluster('RecencyCluster', 'Recency',tx_user,False)
#see details of each cluster
tx_user.groupby('RecencyCluster')['Recency'].describe()
#get order counts for each user and create a dataframe with it
tx_frequency = rev_trns.groupby('user_id').transaction_date.count().reset_index()
tx_frequency.columns = ['user_id','Frequency']
#add this data to our main dataframe
tx_user = pd.merge(tx_user, tx_frequency, on='user_id')
#k-means on frequency
kmeans = KMeans(n_clusters=3)
kmeans.fit(tx_user[['Frequency']])
tx_user['FrequencyCluster'] = kmeans.predict(tx_user[['Frequency']])
#order the frequency clusters (higher id = more frequent)
tx_user = order_cluster('FrequencyCluster', 'Frequency',tx_user,True)
#see details of each cluster
tx_user.groupby('FrequencyCluster')['Frequency'].describe()
#calculate total revenue for each customer
tx_revenue = rev_trns.groupby('user_id').amount_usd.sum().reset_index()
#merge it with our main dataframe
tx_user = pd.merge(tx_user, tx_revenue, on='user_id')
tx_user.head()
#apply clustering on total revenue
kmeans = KMeans(n_clusters=3)
kmeans.fit(tx_user[['amount_usd']])
tx_user['RevenueCluster'] = kmeans.predict(tx_user[['amount_usd']])
#order the cluster numbers (higher id = higher revenue)
tx_user = order_cluster('RevenueCluster', 'amount_usd',tx_user,True)
#show details of the dataframe
tx_user.groupby('RevenueCluster')['amount_usd'].describe()
#calculate the overall score and use mean() to see details
tx_user['OverallScore'] = tx_user['RecencyCluster'] + tx_user['FrequencyCluster'] + tx_user['RevenueCluster']
# FIX: select multiple columns with a list — tuple indexing after groupby was
# deprecated and is removed in pandas >= 2.0
tx_user.groupby('OverallScore')[['Recency','Frequency','amount_usd']].mean()
# map the overall score to three named segments
tx_user['segment'] = 'Low-Value'
tx_user.loc[tx_user['OverallScore']>=1,'segment'] = 'Mid-Value'
tx_user.loc[tx_user['OverallScore']>3,'segment'] = 'High-Value'
tx_user.groupby('segment')[['Recency','Frequency','amount_usd']].mean()
# FIX: seaborn >= 0.12 requires keyword x/y arguments
sns.scatterplot(x=tx_user['Recency'], y=tx_user['Frequency'], hue=tx_user['segment'])
sns.scatterplot(x=tx_user['Frequency'], y=tx_user['amount_usd'], hue=tx_user['segment'])
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split
# overall date range available in the transaction data
rev_trns['transaction_date'].max(),rev_trns['transaction_date'].min()
#create the 12-month (feature window) and 5-month (label window) dataframes
start_date = '2018-01-01'
end_date = '2018-12-30'
mask = (rev_trns['transaction_date'] > start_date) & (rev_trns['transaction_date'] <= end_date)
tx_12m = rev_trns.loc[mask]
tx_12m.shape
start_date = '2019-01-01'
end_date = '2019-05-16'
mask = (rev_trns['transaction_date'] > start_date) & (rev_trns['transaction_date'] <= end_date)
tx_5m = rev_trns.loc[mask]
tx_5m.shape
#create tx_user for assigning clusters (one row per user seen in the 12m window)
tx_user = pd.DataFrame(tx_12m['user_id'].unique())
tx_user.columns = ['user_id']
#order cluster method
# NOTE(review): duplicate of the order_cluster defined earlier in this file —
# consider keeping a single definition.
def order_cluster(cluster_field_name, target_field_name, df, ascending):
    """Relabel cluster ids so they are ordered by the cluster mean of
    ``target_field_name`` (ascending=True -> id 0 gets the lowest mean).

    Returns a copy of ``df`` with ``cluster_field_name`` replaced by the
    ordered labels. (FIX: removed the unused ``new_cluster_field_name`` local.)
    """
    df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    df_new = df_new.sort_values(by=target_field_name, ascending=ascending).reset_index(drop=True)
    # the sorted positional index becomes the new, ordered cluster id
    df_new['index'] = df_new.index
    df_final = pd.merge(df, df_new[[cluster_field_name, 'index']], on=cluster_field_name)
    df_final = df_final.drop([cluster_field_name], axis=1)
    df_final = df_final.rename(columns={"index": cluster_field_name})
    return df_final
tx_12m.head(2)
#calculate the recency score on the 12-month window
tx_max_purchase = tx_12m.groupby('user_id').transaction_date.max().reset_index()
tx_max_purchase.columns = ['user_id','MaxPurchaseDate']
# days since last purchase, measured from the window's max purchase date
tx_max_purchase['Recency'] = (tx_max_purchase['MaxPurchaseDate'].max() - tx_max_purchase['MaxPurchaseDate']).dt.days
tx_user = pd.merge(tx_user, tx_max_purchase[['user_id','Recency']], on='user_id')
kmeans = KMeans(n_clusters=3)
kmeans.fit(tx_user[['Recency']])
tx_user['RecencyCluster'] = kmeans.predict(tx_user[['Recency']])
# ascending=False: higher cluster id = more recently active
tx_user = order_cluster('RecencyCluster', 'Recency',tx_user,False)
tx_user.head(2)
#calculate the frequency score (transaction count per user)
tx_frequency = tx_12m.groupby('user_id').transaction_date.count().reset_index()
tx_frequency.columns = ['user_id','Frequency']
tx_user = pd.merge(tx_user, tx_frequency, on='user_id')
kmeans = KMeans(n_clusters=3)
kmeans.fit(tx_user[['Frequency']])
tx_user['FrequencyCluster'] = kmeans.predict(tx_user[['Frequency']])
tx_user = order_cluster('FrequencyCluster', 'Frequency',tx_user,True)
tx_user.head(2)
#calculate the revenue score on the 12-month window
# FIX: materialize the slice before adding a column — assigning into the
# .loc[mask] view raises SettingWithCopyWarning and may silently fail
tx_12m = tx_12m.copy()
tx_12m['Revenue'] = tx_12m['amount_usd']
tx_revenue = tx_12m.groupby('user_id').Revenue.sum().reset_index()
tx_user = pd.merge(tx_user, tx_revenue, on='user_id')
kmeans = KMeans(n_clusters=3)
kmeans.fit(tx_user[['Revenue']])
tx_user['RevenueCluster'] = kmeans.predict(tx_user[['Revenue']])
tx_user = order_cluster('RevenueCluster', 'Revenue',tx_user,True)
tx_user.head(2)
#overall scoring: sum of the three ordered cluster ids
tx_user['OverallScore'] = tx_user['RecencyCluster'] + tx_user['FrequencyCluster'] + tx_user['RevenueCluster']
tx_user['segment'] = 'Low-Value'
tx_user.loc[tx_user['OverallScore']>3,'segment'] = 'High-Value'
tx_user.segment.value_counts()
# share of high-value users
# NOTE(review): 359/17466 are hard-coded counts from a previous run —
# recompute from value_counts() instead of trusting these numbers
(359/17466)*100
#calculate the 5-month revenue and create a new dataframe for it
# FIX: copy() — adding a column to a .loc[mask] slice raises
# SettingWithCopyWarning and may silently fail
tx_5m = tx_5m.copy()
tx_5m['Revenue'] = tx_5m['amount_usd']
tx_user_5m = tx_5m.groupby('user_id')['Revenue'].sum().reset_index()
tx_user_5m.columns = ['user_id','m5_Revenue']
tx_user_5m.head()
# left-join: keep every 12m user; users with no 5m activity get revenue 0
tx_merge = pd.merge(tx_user, tx_user_5m, on='user_id', how='left')
tx_merge = tx_merge.fillna(0)
tx_merge.head()
#remove outliers above the 99th percentile of 5m revenue
tx_merge = tx_merge[tx_merge['m5_Revenue']<tx_merge['m5_Revenue'].quantile(0.99)]
#creating 2 LTV clusters (comment previously said 3, but the code builds 2)
kmeans = KMeans(n_clusters=2)
kmeans.fit(tx_merge[['m5_Revenue']])
tx_merge['LTVCluster'] = kmeans.predict(tx_merge[['m5_Revenue']])
#order cluster ids by LTV (higher revenue -> higher id)
tx_merge = order_cluster('LTVCluster', 'm5_Revenue',tx_merge,True)
#creating a new cluster dataframe
tx_cluster = tx_merge.copy()
#see details of the clusters
tx_cluster.groupby('LTVCluster')['m5_Revenue'].describe()
tx_cluster['LTVCluster'].value_counts()
# NOTE(review): 981/17646 are hard-coded counts from a previous run
(981/17646)*100 ### distribution is highly skewed
def trans_after_notified(rev_notfc,rev_trns):
    '''
    Left-join transactions onto notifications on user_id and count, per user,
    the notification/transaction pairs where the transaction happened more
    than 1 and up to 7 days AFTER the notification was sent.

    Parameters
    ----------
    rev_notfc : pandas.DataFrame with 'user_id' and 'notification_date'
    rev_trns : pandas.DataFrame with 'user_id' and 'transaction_date'

    Returns a dataframe with columns
    ['user_id', 'after_notification_trans_in_7_days'].

    FIX: dropped the deprecated infer_datetime_format argument (removed in
    modern pandas) and the dead 'is_trans' column that was computed and then
    immediately discarded before returning.
    '''
    df = rev_notfc.merge(rev_trns,how='left',on='user_id') ## notif + transac
    ## convert both date columns to datetime
    df['notification_date'] = pd.to_datetime(df['notification_date'])
    df['transaction_date'] = pd.to_datetime(df['transaction_date'])
    ## signed day difference: negative when the transaction came after the notification
    df['diff_trans_from_notified'] = df['notification_date'] - df['transaction_date']
    df['diff_trans_from_notified'] = df['diff_trans_from_notified'] / np.timedelta64(1, 'D')
    ## flag pairs where the transaction happened 1-7 days after the notification
    ## (diff strictly below -1 and at least -7)
    df["after_notification_trans_in_7_days"]=np.where((-1 > df["diff_trans_from_notified"])&(df["diff_trans_from_notified"]>=-7),1,0)
    ## aggregate the flag per user
    df_grp = df.groupby('user_id')['after_notification_trans_in_7_days'].sum().reset_index()
    print(df_grp.head(2))
    return df_grp[['user_id','after_notification_trans_in_7_days']] ## final data
def dummy_grp_features(rev_trns,rev_dvc,rev_notfc,rev_user):
    '''
    Build one row of engineered features per user_id.

    One-hot encodes the categorical transaction fields (state, direction) and
    notification fields (reason, channel, status), aggregates the dummies per
    user, and left-joins the aggregates onto users + devices.

    NOTE(review): the column lists below are hard-coded one-hot levels — if a
    level is missing from (or added to) the raw extracts, the groupby will
    raise a KeyError. Confirm against the data before reuse.
    '''
    # one-hot encode transaction state and direction
    rev_trns_1 = pd.get_dummies(rev_trns['transactions_state'])
    rev_trns_2 = pd.get_dummies(rev_trns['direction'])
    rev_trns_grp = pd.concat([rev_trns.reset_index(drop=True), rev_trns_1,rev_trns_2], axis=1)
    # transactions per user
    rev_trns_grp1 = rev_trns.groupby('user_id')['transaction_id'].count().reset_index()
    rev_trns_grp1.rename(columns={'transaction_id': 'count_of transaction_id'},inplace=True)
    # per-user counts of each transaction state
    rev_trns_grp = rev_trns_grp.groupby('user_id')[['CANCELLED','COMPLETED','DECLINED','FAILED','PENDING','REVERTED']].sum().reset_index()
    rev_trns_grp = rev_trns_grp1.merge(rev_trns_grp,how='left',on='user_id') ## join transaction aggregates
    print('--- Grouping on Transaction data complete----')
    # one-hot encode notification reason / channel / status
    rev_notfc_1 = pd.get_dummies(rev_notfc['reason'])
    rev_notfc_2 = pd.get_dummies(rev_notfc['channel'])
    rev_notfc_3 = pd.get_dummies(rev_notfc['status'])
    rev_notfc_grp = pd.concat([rev_notfc.reset_index(drop=True), rev_notfc_1,rev_notfc_2,rev_notfc_3], axis=1)
    # notifications sent per user
    rev_notfc_grp1 = rev_notfc.groupby('user_id')['notification_date'].count().reset_index() ## count of notifications sent to users
    rev_notfc_grp1.rename(columns={'notification_date': 'count_of notification_sent'},inplace=True)
    # per-user counts of each notification reason/channel/status level
    rev_notfc_grp = rev_notfc_grp.groupby('user_id')[['BLACK_FRIDAY','ENGAGEMENT_SPLIT_BILL_RESTAURANT',
    'FIFTH_PAYMENT_PROMO','JOINING_ANNIVERSARY','LOST_CARD_ORDER',
    'MADE_MONEY_REQUEST_NOT_SPLIT_BILL','METAL_GAME_START','METAL_RESERVE_PLAN',
    'NO_INITIAL_CARD_ORDER','NO_INITIAL_CARD_USE','NO_INITIAL_FREE_PROMOPAGE_CARD_ORDER',
    'ONBOARDING_TIPS_ACTIVATED_USERS','PREMIUM_ENGAGEMENT_FEES_SAVED','PREMIUM_ENGAGEMENT_INACTIVE_CARD',
    'PUMPKIN_PAYMENT_NOTIFICATION','REENGAGEMENT_ACTIVE_FUNDS','WELCOME_HOME','EMAIL','PUSH','SMS','FAILED','SENT']].sum().reset_index()
    rev_notfc_grp = rev_notfc_grp1.merge(rev_notfc_grp,how='left',on='user_id') ## join notification aggregates
    # users -> devices -> notification aggregates -> transaction aggregates
    r1 = rev_user.merge(rev_dvc,how='left',on='user_id')
    r2 = r1.merge(rev_notfc_grp,how='left',on='user_id')
    r3 = r2.merge(rev_trns_grp,how='left',on='user_id')
    print('--- Merging Done, data prep done ---')
    return r3
def feature_engineering_pipeline(rev_notfc_tr,rev_trns_tr,rev_user,rev_dvc):
    '''
    Feature pipeline tying together all feature-creation steps.

    Parameters
    ----------
    rev_notfc_tr : notifications filtered to the feature window
    rev_trns_tr : transactions filtered to the feature window
    rev_user, rev_dvc : user and device extracts

    Returns one dataframe per user with grouped features plus the
    "transacted within 7 days of a notification" count.
    '''
    # BUG FIX: previously called trans_after_notified(rev_notfc, rev_trns),
    # referencing the module-level globals instead of the filtered frames
    # passed in — which silently ignored the caller's date-window filtering.
    df1 = trans_after_notified(rev_notfc_tr,rev_trns_tr) ## within-7-days feature
    df_grp = dummy_grp_features(rev_trns_tr,rev_dvc,rev_notfc_tr,rev_user) ## all group-by features
    df_fnl = df_grp.merge(df1,how='left',on='user_id') ## all features concatenated
    return df_fnl
# 12-month windows for both transactions and notifications
start_date = '2018-01-01'
end_date = '2018-12-30'
mask = (rev_trns['transaction_date'] > start_date) & (rev_trns['transaction_date'] <= end_date)
trns_12 = rev_trns.loc[mask] ## transactions in the feature window
# BUG FIX: the old code sliced rev_notfc with the TRANSACTION mask — a boolean
# series indexed by rev_trns that selects arbitrary/invalid rows of rev_notfc.
# Build the notification mask from the notification dates instead.
notfc_dates = pd.to_datetime(rev_notfc['notification_date'])
notfc_mask = (notfc_dates > start_date) & (notfc_dates <= end_date)
notfc_12 = rev_notfc.loc[notfc_mask] ## notifications in the feature window
print('12 months of transaction & Notification data:',trns_12.shape,notfc_12.shape)
tx_feature_df = feature_engineering_pipeline(notfc_12,trns_12,rev_user,rev_dvc) ## grouped features done
# days the user has been with the app, measured from the last transaction date
max_date = rev_trns['transaction_date'].max()
tx_feature_df['created_date'] = pd.to_datetime(tx_feature_df['created_date'])
tx_feature_df['days_since_app'] = max_date - tx_feature_df['created_date']
tx_feature_df['days_since_app'] = tx_feature_df['days_since_app'] / np.timedelta64(1, 'D')
## person's age who is using the app
# NOTE(review): 2020 is a hard-coded reference year — consider max_date.year
tx_feature_df['age']=2020 -tx_feature_df['birth_year']
tx_feature_df.head(2)
sns.distplot(tx_feature_df['age']) ## roughly normally distributed
sns.distplot(tx_feature_df['days_since_app']) ## majorly old users
print(tx_cluster.head(2)) ## rfm data
print(tx_feature_df.head(2)) ## other features
print(tx_cluster.shape,tx_feature_df.shape)
## merge the engineered features with the RFM/LTV cluster data (inner join on user_id)
final_df = tx_feature_df.merge(tx_cluster,how='inner',on='user_id')
final_df.shape
final_df.head(2)
# combine the two FAILED columns produced by the merges
# NOTE(review): presumably FAILED_x = failed transactions and FAILED_y =
# failed notifications (suffixes from the user_id merges) — confirm before reuse
final_df['failed_notifc'] = final_df['FAILED_x']+final_df['FAILED_y']
final_df.columns
final_df['LTVCluster'].value_counts()
# final modelling feature list: user attributes + notification aggregates +
# transaction aggregates + engineered features + RFM cluster scores, with
# LTVCluster as the prediction target
FEATURES = ['user_id','user_settings_crypto_unlocked','plan','attributes_notifications_marketing_push',
'attributes_notifications_marketing_email','num_contacts','num_referrals', 'num_successful_referrals','brand',
'count_of notification_sent','BLACK_FRIDAY','ENGAGEMENT_SPLIT_BILL_RESTAURANT', 'FIFTH_PAYMENT_PROMO',
'JOINING_ANNIVERSARY', 'LOST_CARD_ORDER','MADE_MONEY_REQUEST_NOT_SPLIT_BILL', 'METAL_GAME_START',
'METAL_RESERVE_PLAN', 'NO_INITIAL_CARD_ORDER', 'NO_INITIAL_CARD_USE',
'NO_INITIAL_FREE_PROMOPAGE_CARD_ORDER','ONBOARDING_TIPS_ACTIVATED_USERS', 'PREMIUM_ENGAGEMENT_FEES_SAVED',
'PREMIUM_ENGAGEMENT_INACTIVE_CARD', 'PUMPKIN_PAYMENT_NOTIFICATION',
'REENGAGEMENT_ACTIVE_FUNDS', 'WELCOME_HOME', 'EMAIL', 'PUSH', 'SMS','PENDING', 'REVERTED',
'SENT', 'count_of transaction_id', 'CANCELLED', 'COMPLETED','failed_notifc','after_notification_trans_in_7_days',
'days_since_app', 'age','RecencyCluster','FrequencyCluster','RevenueCluster','LTVCluster']
## X,y split: LTVCluster is the target, user_id is only an identifier
df = final_df[FEATURES]
X = df.drop(['user_id','LTVCluster'],axis=1)
y = df['LTVCluster']
y.value_counts()
# NOTE(review): 981/16665 are hard-coded counts from a previous run
(981/16665)*100
X.head(2)
X.describe()
## grab categorical (object-dtype) columns
categorical_features = X.select_dtypes(include=['object'])
categorical_features = categorical_features.columns
## Encode the object data
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
categorical_features
# Categorical boolean mask over the dtypes
categorical_feature_mask = X.dtypes==object
# filter categorical columns using the mask and turn them into a list
categorical_cols = X.columns[categorical_feature_mask].tolist()
categorical_cols
from sklearn.preprocessing import LabelEncoder
# instantiate a labelencoder object
le = LabelEncoder()
# apply le on categorical feature columns
# NOTE(review): a single shared encoder is re-fit for every column inside
# apply, so the per-column mappings are lost afterwards (no inverse_transform
# possible); use one encoder per column if the mapping must be kept.
X[categorical_cols] = X[categorical_cols].apply(lambda col: le.fit_transform(col))
X[categorical_cols].head(2)
## fill the missing values with 0 (very naive way )
X.fillna(0,inplace=True)
# 80/20 train-test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
print('X_train & X_test shape:',X_train.shape,X_test.shape)
print('y_train & y_test distribution shape:',y_train.value_counts(),y_test.value_counts())
# build a resampled training set to fight the heavy class imbalance
# BUG FIX: the old code did `df_x = X_train` (an alias), so appending the
# label column mutated X_train itself — leaking the target into the feature
# matrix. Work on an explicit copy instead.
df_x = X_train.copy()
df_x['LTVCluster'] = y_train
# Class count (value_counts is sorted by frequency, so the majority class 0 comes first)
count_class_0, count_class_1 = df_x.LTVCluster.value_counts()
# Divide by class
df_class_0 = df_x[df_x['LTVCluster'] == 0]
df_class_1 = df_x[df_x['LTVCluster'] == 1]
# random under-sampling of the majority class (seeded for reproducibility,
# matching the random_state=123 used elsewhere in this file)
df_class_0_under = df_class_0.sample(count_class_1, random_state=123)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)
# random over-sampling of the minority class, with replacement
df_class_1_over = df_class_1.sample(count_class_0, replace=True, random_state=123)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)
print('Random over-sampling:')
print(df_test_over.LTVCluster.value_counts())
df_test_over.LTVCluster.value_counts().plot(kind='bar', title='Count (target)');
X_train_smp = df_test_over.drop(['LTVCluster'],axis=1)
y_train_smp = df_test_over['LTVCluster']
print('After over sampling new training data:',X_train_smp.shape,y_train_smp.value_counts())
# find and remove correlated features
def correlation(dataset, threshold):
    """Return the set of column names whose absolute pairwise correlation
    with any EARLIER column exceeds ``threshold``.

    Only the later column of each highly-correlated pair is reported, so
    dropping the returned columns keeps one representative per pair.
    """
    corr_matrix = dataset.corr().abs()
    cols = corr_matrix.columns
    correlated = set()
    # walk the lower triangle of the correlation matrix
    for later in range(len(cols)):
        for earlier in range(later):
            if corr_matrix.iloc[later, earlier] > threshold:
                correlated.add(cols[later])
    return correlated
corr_features = correlation(X_train_smp, 0.8)
print('correlated features: ', len(set(corr_features)) )
# BUG FIX: X_test_smp was never defined, so the drop below raised NameError.
# The test fold is not resampled, so it is simply a copy of X_test with the
# same correlated columns removed.
X_test_smp = X_test.copy()
# remove the correlated features from both folds
X_train_smp.drop(labels=corr_features, axis=1, inplace=True)
X_test_smp.drop(labels=corr_features, axis=1, inplace=True)
# keep a copy of the dataset at this stage
X_train_corr = X_train_smp.copy()
X_test_corr = X_test_smp.copy()
# find important features using univariate roc-auc:
# fit a single-feature decision tree per column on the resampled train fold
# and score it on the (un-resampled) test fold
roc_values = []
for feature in X_train_corr.columns:
    clf = DecisionTreeClassifier()
    clf.fit(X_train_smp[feature].fillna(0).to_frame(), y_train_smp)
    y_scored = clf.predict_proba(X_test_smp[feature].fillna(0).to_frame())
    # keep the AUC for the positive-class probability (second column)
    roc_values.append(roc_auc_score(y_test, y_scored[:, 1]))
# let's add the variable names and order it for clearer visualisation
roc_values = pd.Series(roc_values)
roc_values.index = X_train_smp.columns
roc_values.sort_values(ascending=False).plot.bar(figsize=(20, 8))
# select features using the importance derived from a random forest
sel_ = SelectFromModel(RandomForestClassifier(n_estimators=400))
sel_.fit(X_train_smp, y_train_smp)
# keep only the features whose importance passes SelectFromModel's default
# threshold and parse the numpy output back into dataframes
X_train_rf = pd.DataFrame(sel_.transform(X_train_smp))
X_test_rf = pd.DataFrame(sel_.transform(X_test_smp))
# add the column names back
X_train_rf.columns = X_train_smp.columns[(sel_.get_support())]
X_test_rf.columns = X_train_smp.columns[(sel_.get_support())]
print(X_train_rf.shape,X_test_rf.shape)
y_test.value_counts()
def model_builder(algo,X_train,y_train,X_test,y_true=None):
    '''
    Fit ``algo`` on the training data, predict on ``X_test`` and print a
    classification report, confusion matrix and ROC-AUC.

    Parameters
    ----------
    algo : any estimator exposing fit/predict
    X_train, y_train : training features and labels
    X_test : features to predict on
    y_true : labels to evaluate against; defaults to the module-level
        ``y_test`` for backward compatibility with existing call sites.

    Returns the predicted labels.
    '''
    if y_true is None:
        y_true = y_test  # fall back to the global hold-out labels
    algo.fit(X_train,y_train)
    y_pred = algo.predict(X_test)
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true,y_pred))
    print('==ROC-AUC==:',roc_auc_score(y_true,y_pred))
    return y_pred
# Scale features for Logistic Regression
scaler = MinMaxScaler()
X_train_mt = scaler.fit_transform(X_train_smp)
# BUG FIX: transform (not fit_transform) the test fold — refitting the scaler
# on test data leaks test statistics and makes the two folds incomparable
X_test_mt = scaler.transform(X_test_smp)
## logistic regression on the full (resampled) feature set
# FIX: the default lbfgs solver does not support the l1 penalty in modern
# scikit-learn — liblinear must be requested explicitly
algo = LogisticRegression(penalty='l1', solver='liblinear')
model_builder(algo,X_train_smp,y_train_smp,X_test)
## without correlated features
model_builder(algo,X_train_corr,y_train_smp,X_test_corr)
## with the random-forest-selected features
model_builder(algo,X_train_rf,y_train_smp,X_test_rf)
# gradient boosting on the selected features
algo=xgb.XGBClassifier(max_depth=5, learning_rate=0.1,n_jobs=-1)
_ = model_builder(algo,X_train_rf,y_train_smp,X_test_rf)
# shallow random forest on the selected features
algo = RandomForestClassifier(max_depth=3)
model_builder(algo,X_train_rf,y_train_smp,X_test_rf)
# re-split the ORIGINAL (un-resampled) data, same seed as the first split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
print('X_train & X_test shape:',X_train.shape,X_test.shape)
print('y_train & y_test distribution shape:',y_train.value_counts(),y_test.value_counts())
# select features using the importance derived from a random forest
sel_ = SelectFromModel(RandomForestClassifier(n_estimators=400))
sel_.fit(X_train, y_train)
# parse the numpy output of the selector back into dataframes
X_train_rf = pd.DataFrame(sel_.transform(X_train))
X_test_rf = pd.DataFrame(sel_.transform(X_test))
# add the column names back
X_train_rf.columns = X_train.columns[(sel_.get_support())]
X_test_rf.columns = X_train.columns[(sel_.get_support())]
print(X_train_rf.shape,X_test_rf.shape)
# find important features using univariate roc-auc:
# one single-feature decision tree per column, trained on the un-resampled
# train fold and scored on the test fold
roc_values = []
for feature in X_train.columns:
    clf = DecisionTreeClassifier()
    clf.fit(X_train[feature].fillna(0).to_frame(), y_train)
    y_scored = clf.predict_proba(X_test[feature].fillna(0).to_frame())
    # keep the AUC for the positive-class probability (second column)
    roc_values.append(roc_auc_score(y_test, y_scored[:, 1]))
# let's add the variable names and order it for clearer visualisation
roc_values = pd.Series(roc_values)
roc_values.index = X_train.columns
roc_values.sort_values(ascending=False).plot.bar(figsize=(20, 8))
y_train.value_counts()
# majority/minority ratio for scale_pos_weight-style weighting
# NOTE(review): 13347/769 are hard-coded counts from a previous run —
# recompute from y_train.value_counts() instead of trusting these numbers
scale_pos_weight = 13347/769
scale_pos_weight
from sklearn.utils import class_weight
# FIX: scikit-learn >= 1.0 made these arguments keyword-only — positional
# calls raise a TypeError
class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                  classes=np.unique(y_train),
                                                  y=y_train)
class_weights
## class-weighted random forest on the selected features
algo = RandomForestClassifier(max_depth=3,class_weight={0:0.52880797,1:9.17815345})
model_builder(algo,X_train_rf,y_train,X_test_rf)
# FIX: liblinear solver is required for the l1 penalty in modern scikit-learn
algo = LogisticRegression(penalty='l1', solver='liblinear', class_weight={0:0.52880797,1:9.17815345})
model_builder(algo,X_train_rf,y_train,X_test_rf)
### Predict test labels and class probabilities from the last fitted
#logistic regression model
pred_y=algo.predict(X_test_rf)
probs_y=algo.predict_proba(X_test_rf)
from sklearn.metrics import accuracy_score
# precision/recall at every probability threshold, using the probability of
# class 1 (second column of probs_y)
precision, recall, thresholds = precision_recall_curve(y_test, probs_y[:,
1])
#pr_auc = metrics.auc(recall, precision)
plt.title("Precision-Recall vs Threshold Chart")
plt.plot(thresholds, precision[: -1], "b--", label="Precision")
plt.plot(thresholds, recall[: -1], "r--", label="Recall")
plt.ylabel("Precision, Recall")
plt.xlabel("Threshold")
plt.legend(loc="lower left")
plt.ylim([0,1])
1. Group A will be exposed to the offer and is expected to show high retention.
2. Group B will not be exposed to the offer and is expected to show lower retention.
This also helps us test model accuracy. If group B's retention rate is 50%, it clearly shows that our model is not working.
Success_Metric = Retention rate
# simulate an A/B test population: 20k high-value customers,
# first half assigned to the test group, second half to control
df_hv = pd.DataFrame()
df_hv['customer_id'] = np.array([count for count in range(20000)])
df_hv['segment'] = np.array(['high-value' for _ in range(20000)])
df_hv['group'] = 'control'
df_hv.loc[df_hv.index<10000,'group'] = 'test'
df_hv.head(2)
# simulated purchase counts: the test group is drawn with a higher
# Poisson mean (0.6 vs 0.5), so a real uplift exists by construction
df_hv.loc[df_hv.group == 'test', 'purchase_count'] = np.random.poisson(0.6, 10000)
df_hv.loc[df_hv.group == 'control', 'purchase_count'] = np.random.poisson(0.5, 10000)
test_results = df_hv[df_hv.group == 'test'].purchase_count
control_results = df_hv[df_hv.group == 'control'].purchase_count
hist_data = [test_results, control_results]
group_labels = ['test', 'control']
H1: There is a significant difference between the groups.
from scipy import stats
def eval_test(test_results,control_results):
    """Run an independent two-sample t-test on the two groups and print
    whether the observed difference is statistically significant (p < 0.05)."""
    _, p_value = stats.ttest_ind(test_results, control_results)
    verdict = 'result is significant' if p_value < 0.05 else 'result is not significant'
    print(verdict)
# run the significance test on the simulated test/control purchase counts
eval_test(test_results,control_results)